# A tool for randomly sampling data from different value intervals in a CSV file, creating balanced subsets for each range.

import pandas as pd
import numpy as np
import os

# Read the CSV file. header=None means the first row is treated as data, and
# the two columns are named explicitly here.
df = pd.read_csv(r"D:/***", header=None, names=['filename', 'value'])

# Value ranges to sample from, given as (lower, upper) pairs.
intervals = [(0.000, 0.200), (0.200, 0.300), (0.300, 0.400), (0.400, 1)]

# Directory that receives the sampled CSV files.
output_dir = r'D:/***'

# exist_ok=True creates the directory (and any missing parents) when needed
# and is a no-op when it already exists, avoiding the race between a separate
# existence check and the creation call.
os.makedirs(output_dir, exist_ok=True)

# Process each interval: draw a fixed-size random sample from the rows whose
# value falls inside it and write that sample to its own CSV file.

# Number of rows drawn from every interval, and the seed that makes the draw
# reproducible from run to run.
SAMPLE_SIZE = 1500
RANDOM_SEED = 29

saved_count = 0
for lower, upper in intervals:
    # Filter data within the half-open interval [lower, upper).
    # NOTE(review): because the upper bound is exclusive, a value exactly
    # equal to the largest upper bound matches no interval -- confirm that
    # this is intended.
    df_interval = df[(df['value'] >= lower) & (df['value'] < upper)]

    # An interval with too few rows cannot supply a full sample. Skip it, but
    # say so, so that a missing output file is not a silent surprise.
    available = len(df_interval)
    if available < SAMPLE_SIZE:
        print(
            f"Skipping interval [{lower:.3f}, {upper:.3f}): "
            f"only {available} rows available, {SAMPLE_SIZE} required"
        )
        continue

    df_sampled = df_interval.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

    # Save to CSV file, filename is the interval range
    filename = f"{lower:.3f}_{upper:.3f}.csv"
    output_path = os.path.join(output_dir, filename)
    df_sampled.to_csv(output_path, index=False, header=False)
    saved_count += 1

# Only claim that everything was saved when no interval was skipped.
if saved_count == len(intervals):
    print("Task completed, all files have been saved to", output_dir)
else:
    print(
        f"Task completed, {saved_count} of {len(intervals)} files have been saved to",
        output_dir,
    )
